## Setup, Load Libraries & Report Versions
knitr::opts_chunk$set(echo = TRUE)
get_os <- function(){
sysinf <- Sys.info()
if (!is.null(sysinf)){
os <- sysinf['sysname']
if (os == 'Darwin')
os <- "osx"
} else { ## mystery machine
os <- .Platform$OS.type
if (grepl("^darwin", R.version$os))
os <- "osx"
if (grepl("linux-gnu", R.version$os))
os <- "linux"
if (grepl("mingw", R.version$os))
os <- "windows"
}
tolower(os)
}
if (get_os()=="windows"){
home_folder = "Z:~/viallr01/"
}else{
home_folder = "~/"
}
## Load Libraries -------------------
## IMPORTANT!
## Please check the paths and files in the file "load_packages_and_functions.R" before running this!
## All .R files must be in the same folder, specified below in 'lib_folder'
lib_folder = paste0(home_folder,"ad-omics/ricardo/MyRepo/structuralvariation/R_Library/")
source(paste0(lib_folder,"load_packages_and_functions.R"))QC results for two ROSMAP samples sequenced using PacBio CLR (Sequel 2)
p_load(magick)
results_path = paste0(home_folder,"ad-omics/ricardo/ROSMAP_LongRead/SV_Analysis/00_QC/raw/")
ggdraw() + draw_image(paste0(results_path,"SM_CJEK6","/HistogramReadlength.png"), scale = 1)stats = read.delim2(paste0(results_path,"SM_CJEK6","/NanoStats.txt"))
data.frame(Summary = gsub("(.*):(.*)","\\1",stats[1:7,]), Value = gsub("(.*):(.*)","\\2",stats[1:7,]) %>% trim.spaces())results_path = paste0(home_folder,"ad-omics/ricardo/ROSMAP_LongRead/SV_Analysis/00_QC/raw/")
ggdraw() + draw_image(paste0(results_path,"SM_CJK3B","/HistogramReadlength.png"), scale = 1)stats = read.delim2(paste0(results_path,"SM_CJK3B","/NanoStats.txt"))
data.frame(Summary = gsub("(.*):(.*)","\\1",stats[1:7,]), Value = gsub("(.*):(.*)","\\2",stats[1:7,]) %>% trim.spaces())samples.id = c("SM_CJEK6","SM_CJK3B")
results_path = paste0(home_folder,"ad-omics/ricardo/ROSMAP_LongRead/SV_Analysis/")
p_load(magick)
plot_sample <- list()
for (sampleID in samples.id){
p1 <- ggdraw() + draw_image(paste0(results_path,"01_mapping/h37/",sampleID,".svim/sv-genotypes-q5.png"), scale = 1)
p2 <- ggdraw() + draw_image(paste0(results_path,"01_mapping/h37/",sampleID,".svim/sv-lengths-q5.png"), scale = 1)
title <- ggdraw() +
draw_label(
sampleID,
fontface = 'bold',
x = 0,
hjust = 0
) + theme(plot.margin = margin(0, 0, 0, 7))
plot_row <- plot_grid(p1, p2)
plot_sample[[sampleID]] <- plot_grid(title, plot_row,
ncol = 1,
rel_heights = c(0.1, 1))
}
plot_grid(plotlist = plot_sample, ncol = 1)bench_dir = paste0(home_folder,"ad-omics/ricardo/ROSMAP_LongRead/ShortRead_results/final_gt/")
svtypes = c("DEL","DUP","INS","INV")
## INVs are mixed with complex SVs. We aggregate measures for INVs + different complex types
# ml R/3.6.2 bedtools bcftools picard samtools parallel
# samples=( "SM_CJK3B" "SM_CJEK6" )
# svtypes=( "DUP_INV" "DEL_INV" "DEL_DUP_INV" "DISDUP" )
# for sample in "${samples[@]}"; do
# for svtype in "${svtypes[@]}"; do
# sed "s/INV/${svtype}/g" ${sample}.gt.INV.filt.bed > ${sample}.gt.${svtype}.filt.bed
# echo "rm -rf ${sample}_vapor_${svtype}; vapor bed --sv-input ${sample}.gt.${svtype}.filt.bed --output-path ${sample}_vapor_${svtype}/ --reference ~/ad-omics/ricardo/Data/bayestyper_GRCh37_bundle_v1.3/GRCh37_canon.fa --pacbio-input ~/ad-omics/ricardo/ROSMAP_LongRead/ShortRead_results/${sample}.mm2.sorted.bam" | bsub -n 1 -R 'span[hosts=1]' -R 'rusage[mem=4000]' -P acc_ad-omics -W 12:00 -oo ${sample}_vapor_${svtype}.out -eo ${sample}_vapor_${svtype}.err
# done
# done
# SM_CJK3B
vapor_res = list()
for (svtype in svtypes){
vapor_tmp0 = read.table(paste0(bench_dir,"SM_CJK3B.gt.",svtype,".filt.bed.vapor"), header = T)
if(svtype == "INV"){
vapor_tmp_cpx = vapor_tmp0 %>% mutate(conf = VaPoR_GS>0 | VaPoR_GT!="0/0")
cpx_types = c("DUP_INV","DEL_INV","DEL_DUP_INV","DISDUP")
for(cpx_type in cpx_types){
vapor_tmp_cpx0 = read.table(paste0(bench_dir,"SM_CJK3B.gt.",cpx_type,".filt.bed.vapor"), header = T)
vapor_tmp_cpx1 = vapor_tmp_cpx0 %>% mutate(conf = VaPoR_GS>0 | VaPoR_GT!="0/0")
vapor_tmp_cpx$conf = vapor_tmp_cpx$conf | vapor_tmp_cpx1$conf
}
vapor_tmp_cpx$VaPoR_GS[vapor_tmp_cpx$conf] <- 1
vapor_tmp_cpx$VaPoR_GT[vapor_tmp_cpx$conf] <- "0/1"
vapor_res[[svtype]] = vapor_tmp_cpx
}else{
vapor_tmp0$conf = T
vapor_res[[svtype]] = vapor_tmp0
}
}
df_vapor = data.frame(Sample = "SM_CJK3B", SVTYPE = names(vapor_res), N = sapply(vapor_res, nrow))
df_conf_rate = Reduce(full_join, vapor_res) %>% drop_na() %>% group_by(SVTYPE) %>% dplyr::summarise(Sample = "SM_CJK3B", avg = (sum(VaPoR_GS>0 | VaPoR_GT!="0/0")/length(VaPoR_GS)), n = n())
df_conf_rate = rbind(df_conf_rate , Reduce(full_join, vapor_res) %>% drop_na() %>% dplyr::summarise(Sample = "SM_CJK3B", SVTYPE = "ALL", avg = (sum(VaPoR_GS>0 | VaPoR_GT!="0/0")/length(VaPoR_GS)), n = n()))
# SM_CJEK6
vapor_res = list()
for (svtype in svtypes){
vapor_tmp0 = read.table(paste0(bench_dir,"SM_CJEK6.gt.",svtype,".filt.bed.vapor"), header = T)
if(svtype == "INV"){
vapor_tmp_cpx = vapor_tmp0 %>% mutate(conf = VaPoR_GS>0 | VaPoR_GT!="0/0")
cpx_types = c("DUP_INV","DEL_INV","DEL_DUP_INV","DISDUP")
for(cpx_type in cpx_types){
vapor_tmp_cpx0 = read.table(paste0(bench_dir,"SM_CJEK6.gt.",cpx_type,".filt.bed.vapor"), header = T)
vapor_tmp_cpx1 = vapor_tmp_cpx0 %>% mutate(conf = VaPoR_GS>0 | VaPoR_GT!="0/0")
vapor_tmp_cpx$conf = vapor_tmp_cpx$conf | vapor_tmp_cpx1$conf
}
vapor_tmp_cpx$VaPoR_GS[vapor_tmp_cpx$conf] <- 1
vapor_tmp_cpx$VaPoR_GT[vapor_tmp_cpx$conf] <- "0/1"
vapor_res[[svtype]] = vapor_tmp_cpx
}else{
vapor_tmp0$conf = T
vapor_res[[svtype]] = vapor_tmp0
}
}
df_vapor = rbind(df_vapor,data.frame(Sample = "SM_CJEK6", SVTYPE = names(vapor_res), N = sapply(vapor_res, nrow)))
df_conf_rate = rbind(df_conf_rate, Reduce(full_join, vapor_res) %>% drop_na() %>% group_by(SVTYPE) %>% dplyr::summarise(Sample = "SM_CJEK6", avg = (sum(VaPoR_GS>0 | VaPoR_GT!="0/0")/length(VaPoR_GS)), n = n()))
df_conf_rate = rbind(df_conf_rate, Reduce(full_join, vapor_res) %>% drop_na() %>% dplyr::summarise(Sample = "SM_CJEK6", SVTYPE = "ALL", avg = (sum(VaPoR_GS>0 | VaPoR_GT!="0/0")/length(VaPoR_GS)), n = n()))
# Prepare plot
df_conf_rate$SVTYPE = as.character(df_conf_rate$SVTYPE)
df_conf_rate$SVTYPE[df_conf_rate$SVTYPE=="TANDUP"] <- "DUP"
df_conf_rate$SVTYPE = factor(df_conf_rate$SVTYPE, level = c("ALL","DEL","DUP","INS","INV"))
df_conf_rate_avg = df_conf_rate %>% group_by(SVTYPE) %>% dplyr::summarise(avg = median(avg))
c.1 <- ggplot(df_conf_rate[df_conf_rate$SVTYPE!="ALL",], aes(y = Sample, x = n, fill = SVTYPE)) +
geom_bar(stat = "identity", position = "stack", color = "black", width = .7) +
scale_fill_npg() +
theme_tufte(base_family = "Helvetica") +
scale_x_continuous(breaks = c(0, 500, 1000, 1500),
label = c("0", "500", "1k", "1.5k")) +
labs(x = "Number of SVs evaluated", y = "Sample", fill = NULL) +
easy_remove_legend()
c.2 <- ggplot(df_conf_rate, aes(y = avg, x = SVTYPE)) +
geom_point(alpha = 0.6) +
stat_summary(fun.y = median,
geom = "crossbar",
width = 0.5,
color = "#CB454A") +
geom_vline(xintercept= 1.5, colour = "gray") +
geom_text(data = df_conf_rate_avg, aes(label = scales::percent(avg), y = avg+.1),size = 3.5) +
theme_tufte(base_family = "Helvetica") +
expand_limits(y = c(0.3,1.1)) +
labs(y = "Confirmation Rate", x = "SV type")
ggarrange(c.1, c.2, heights = c(1,1.5), ncol = 1, nrow = 2)sessionInfo()## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] splines stats4 parallel grid stats graphics grDevices
## [8] utils datasets methods base
##
## other attached packages:
## [1] magick_2.7.1 org.Hs.eg.db_3.12.0
## [3] GOSemSim_2.16.1 clusterProfiler_3.18.1
## [5] pipeR_0.6.1.3 FactoMineR_2.4
## [7] missMDA_1.18 RcppCNPy_0.2.10
## [9] psycho_0.6.1 cqn_1.36.0
## [11] quantreg_5.83 SparseM_1.78
## [13] preprocessCore_1.52.1 nor1mix_1.3-0
## [15] mclust_5.4.7 gdata_2.18.0
## [17] zoo_1.8-8 qqman_0.1.4
## [19] gridExtra_2.3 wesanderson_0.3.6
## [21] ggrepel_0.9.1 rlist_0.4.6.1
## [23] ComplexHeatmap_2.6.2 circlize_0.4.12
## [25] gridBase_0.4-7 patchwork_1.1.1
## [27] ggeasy_0.1.3 treemap_2.4-2
## [29] GWASdata_1.28.0 GWASTools_1.36.0
## [31] ggupset_0.3.0 boot_1.3-25
## [33] wiggleplotr_1.14.0 Gviz_1.34.1
## [35] edgeR_3.32.1 qvalue_2.22.0
## [37] ggthemes_4.2.4 factoextra_1.0.7
## [39] vroom_1.4.0 forcats_0.5.1
## [41] purrr_0.3.4 readr_1.4.0
## [43] tibble_3.0.6 tidyverse_1.3.0
## [45] compare_0.2-6 superheat_0.1.0
## [47] abind_1.4-5 splitstackshape_1.4.8
## [49] vcfR_1.12.0 ggExtra_0.9
## [51] ggalluvial_0.12.3 ggallin_0.1.1
## [53] SeqArray_1.30.0 SNPRelate_1.24.0
## [55] gdsfmt_1.26.1 hablar_0.3.0
## [57] viridis_0.5.1 viridisLite_0.3.0
## [59] pheatmap_1.0.12 ShortRead_1.48.0
## [61] GenomicAlignments_1.26.0 Rsamtools_2.6.0
## [63] ggstance_0.3.5 aod_1.3.1
## [65] readxl_1.3.1 ggsignif_0.6.0
## [67] venn_1.10 rtracklayer_1.50.0
## [69] VennDiagram_1.6.20 futile.logger_1.4.3
## [71] UpSetR_1.4.0 RColorBrewer_1.1-2
## [73] mdatools_0.11.3 robustbase_0.93-7
## [75] lmerTest_3.1-3 Kendall_2.2
## [77] Amelia_1.7.6 magrittr_2.0.1
## [79] flextable_0.6.3 gtools_3.8.2
## [81] ggfortify_0.4.11 Hmisc_4.5-0
## [83] Formula_1.2-4 skimr_2.1.3
## [85] ggbeeswarm_0.6.0 memisc_0.99.27.3
## [87] lattice_0.20-41 ggiraph_0.7.8
## [89] tables_0.9.6 DT_0.17
## [91] kableExtra_1.3.1 knitr_1.31
## [93] reshape2_1.4.4 ggsci_2.9
## [95] ggpubr_0.4.0 cowplot_1.1.1
## [97] uuid_0.1-4 gghalves_0.1.1
## [99] sitools_1.4 OUTRIDER_1.8.0
## [101] SummarizedExperiment_1.20.0 MatrixGenerics_1.2.1
## [103] matrixStats_0.58.0 GenomicFeatures_1.42.2
## [105] AnnotationDbi_1.52.0 rstatix_0.6.0
## [107] qPLEXanalyzer_1.8.2 MSnbase_2.16.1
## [109] ProtGenerics_1.22.0 mzR_2.24.1
## [111] Rcpp_1.0.6 proDA_1.4.0
## [113] GGally_2.1.1 ggridges_0.5.3
## [115] arules_1.6-7 glmnet_4.1-1
## [117] gam_1.20 arm_1.11-2
## [119] lme4_1.1-26 MASS_7.3-53
## [121] bumphunter_1.32.0 locfit_1.5-9.4
## [123] iterators_1.0.13 foreach_1.5.1
## [125] snpStats_1.40.0 Matrix_1.2-18
## [127] survival_3.2-7 TnT_1.12.0
## [129] regioneR_1.22.0 GenomicRanges_1.42.0
## [131] GenomeInfoDb_1.26.4 variancePartition_1.20.0
## [133] BiocParallel_1.24.1 limma_3.46.0
## [135] janitor_2.1.0 Biostrings_2.58.0
## [137] XVector_0.30.0 IRanges_2.24.1
## [139] S4Vectors_0.28.1 multtest_2.46.0
## [141] Biobase_2.50.0 BiocGenerics_0.36.0
## [143] stargazer_5.2.2 pander_0.6.3
## [145] stringr_1.4.0 dplyr_1.0.4
## [147] tidyr_1.1.2 data.table_1.13.6
## [149] ggplotify_0.0.5 sjPlot_2.8.7
## [151] pacman_0.5.1 proteus_0.2.14
## [153] proteusSILAC_0.1.0 proteusTMT_0.1.1
## [155] proteusLabelFree_0.1.6 ggbiplot_0.55
## [157] scales_1.1.1 plyr_1.8.6
## [159] ggmosaic_0.3.3 autoplot_0.1
## [161] broom_0.7.4 ggplot2_3.3.3
## [163] safejoin_0.1.0 seqUtils_0.0.0.9000
## [165] bsselectR_0.1.0 devtools_2.3.2
## [167] usethis_2.0.0
##
## loaded via a namespace (and not attached):
## [1] genefilter_1.72.1 sjlabelled_1.1.7 remotes_2.2.0
## [4] vctrs_0.3.6 vsn_3.58.0 blob_1.2.1
## [7] withr_2.4.1 foreign_0.8-80 gdtools_0.2.3
## [10] registry_0.5-1 rvcheck_0.1.8 lifecycle_0.2.0
## [13] emmeans_1.5.5-1 cellranger_1.1.0 munsell_0.5.0
## [16] DESeq2_1.30.1 codetools_0.2-16 seriation_1.2-9
## [19] lmtest_0.9-38 flashClust_1.01-2 DO.db_2.9
## [22] annotate_1.68.0 fs_1.5.0 fastmatch_1.1-0
## [25] impute_1.64.0 formula.tools_1.7.1 biovizBase_1.38.0
## [28] stringi_1.5.3 polyclip_1.10-0 sandwich_3.0-0
## [31] ggraph_2.0.5 cluster_2.1.0 ape_5.4-1
## [34] pkgconfig_2.0.3 prettyunits_1.1.1 lubridate_1.7.9.2
## [37] estimability_1.3 httr_1.4.2 igraph_1.2.6
## [40] progress_1.2.2 GetoptLong_1.0.5 graphlayouts_0.7.1
## [43] haven_2.3.1 snakecase_0.11.0 parameters_0.12.0
## [46] htmltools_0.5.1.1 miniUI_0.1.1.1 GWASExactHW_1.01
## [49] yaml_2.2.1 pillar_1.4.7 later_1.1.0.1
## [52] glue_1.4.2 DBI_1.1.1 affy_1.68.0
## [55] doRNG_1.8.2 gtable_0.3.0 pcaMethods_1.82.0
## [58] caTools_1.18.1 GlobalOptions_0.1.2 latticeExtra_0.6-29
## [61] fastmap_1.1.0 checkmate_2.0.0 promises_1.1.1
## [64] webshot_0.5.2 ggforce_0.3.2 hms_1.0.0
## [67] askpass_1.1 png_0.1-7 ncdf4_1.17
## [70] clue_0.3-58 lazyeval_0.2.2 crayon_1.4.0
## [73] heatmaply_1.2.1 reprex_1.0.0 MALDIquant_1.19.3
## [76] colorRamps_2.3 tidyselect_1.1.0 xfun_0.20
## [79] performance_0.7.0 VariantAnnotation_1.36.0 operator.tools_1.6.3
## [82] rappdirs_0.3.3 bit64_4.0.5 rngtools_1.5
## [85] lambda.r_1.2.4 modelr_0.1.8 affyio_1.60.0
## [88] jpeg_0.1-8.1 mzID_1.28.0 sjmisc_2.8.6
## [91] MatrixModels_0.4-1 leaps_3.1 permute_0.9-5
## [94] PRROC_1.3.1 htmlTable_2.1.0 pinfsc50_1.2.0
## [97] xtable_1.8-4 admisc_0.12 cachem_1.0.3
## [100] DelayedArray_0.16.2 vegan_2.5-7 systemfonts_1.0.1
## [103] mime_0.10 rjson_0.2.20 processx_3.4.5
## [106] insight_0.13.1 numDeriv_2016.8-1.1 tools_4.0.3
## [109] sjstats_0.18.1 cli_2.3.0 dichromat_2.0-0
## [112] assertthat_0.2.1 officer_0.3.16 logistf_1.24
## [115] fgsea_1.16.0 repr_1.1.3 DNAcopy_1.64.0
## [118] BiocFileCache_1.14.0 mgcv_1.8-33 plotly_4.9.3
## [121] tweenr_1.0.1 ensembldb_2.14.0 effectsize_0.4.4
## [124] zlibbioc_1.36.0 zip_2.1.1 hwriter_1.3.2
## [127] bayestestR_0.8.2 rio_0.5.16 shadowtext_0.0.7
## [130] biomaRt_2.46.3 geneplotter_1.68.0 ps_1.5.0
## [133] tidygraph_1.2.0 conquer_1.0.2 KernSmooth_2.23-17
## [136] backports_1.2.1 quantsmooth_1.56.0 scatterplot3d_0.3-41
## [139] farver_2.0.3 bit_4.0.4 gplots_3.1.1
## [142] openxlsx_4.2.3 shiny_1.6.0 scatterpie_0.1.5
## [145] DOSE_3.16.0 futile.options_1.0.1 dendextend_1.14.0
## [148] downloader_0.4 rstudioapi_0.13 minqa_1.2.4
## [151] AnnotationFilter_1.14.0 nlme_3.1-149 shape_1.4.5
## [154] beeswarm_0.3.1 generics_0.1.0 colorspace_2.0-0
## [157] base64enc_0.1-3 XML_3.99-0.5 pkgbuild_1.2.0
## [160] dbplyr_2.1.0 calibrate_1.7.7 GenomeInfoDbData_1.2.4
## [163] evaluate_0.14 memoise_2.0.0 coda_0.19-4
## [166] doParallel_1.0.16 vipor_0.4.5 httpuv_1.5.5
## [169] openssl_1.4.3 BiocManager_1.30.10 formatR_1.7
## [172] pkgload_1.1.0 jsonlite_1.7.2 digest_0.6.27
## [175] BBmisc_1.11 rprojroot_2.0.2 bitops_1.0-6
## [178] RSQLite_2.2.4 rmarkdown_2.6 compiler_4.0.3
## [181] nnet_7.3-14 statmod_1.4.35 carData_3.0-4
## [184] testthat_3.0.1 gridGraphics_0.5-1 rlang_0.4.10
## [187] nloptr_1.2.2.2 sessioninfo_1.1.1 rvest_0.3.6
## [190] mvtnorm_1.1-1 htmlwidgets_1.5.3 labeling_0.4.2
## [193] callr_3.5.1 Cairo_1.5-12.2 curl_4.3
## [196] pbkrtest_0.5-0.1 highr_0.8 DEoptimR_1.0-8
## [199] BSgenome_1.58.0 desc_1.2.0 ggeffects_1.0.2
## [202] enrichplot_1.10.2 RCurl_1.98-1.3 mice_3.13.0
## [205] ggdendro_0.1.22 GO.db_3.12.1 car_3.0-10
## [208] ellipsis_0.3.1 xml2_1.3.2 reshape_0.8.8
## [211] rpart_4.1-15 R6_2.5.0 TSP_1.1-10